In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy
import pandasql as ps
from pandasql import sqldf
mysql = lambda q: sqldf(q, globals())
In [2]:
dim_match_summary = pd.read_csv('dim_match_summary.csv')
print(dim_match_summary.info())
dim_match_summary.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 45 entries, 0 to 44 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 team1 45 non-null object 1 team2 45 non-null object 2 winner 45 non-null object 3 margin 41 non-null object 4 ground 45 non-null object 5 matchDate 45 non-null object 6 match_id 45 non-null object dtypes: object(7) memory usage: 2.6+ KB None
Out[2]:
| team1 | team2 | winner | margin | ground | matchDate | match_id | |
|---|---|---|---|---|---|---|---|
| 0 | Namibia | Sri Lanka | Namibia | 55 runs | Geelong | 16-Oct-22 | T20I # 1823 |
| 1 | Netherlands | U.A.E. | Netherlands | 3 wickets | Geelong | 16-Oct-22 | T20I # 1825 |
| 2 | Scotland | West Indies | Scotland | 42 runs | Hobart | 17-Oct-22 | T20I # 1826 |
| 3 | Ireland | Zimbabwe | Zimbabwe | 31 runs | Hobart | 17-Oct-22 | T20I # 1828 |
| 4 | Namibia | Netherlands | Netherlands | 5 wickets | Geelong | 18-Oct-22 | T20I # 1830 |
In [3]:
#Converting the matchdate column into Date format
dim_match_summary['matchDate'] = pd.to_datetime(dim_match_summary['matchDate'], format='%d-%b-%y')
#Removing T20I# from the matchId
dim_match_summary['matchId'] = dim_match_summary['match_id'].str.extract(r'#\s(\d+[a-zA-Z]*)')
#Replacing 'no result' and 'abandoned' with 'no result' in the winner column
dim_match_summary.loc[dim_match_summary['winner'].isin(['no result', 'abandoned']), 'winner'] = 'no result'
#Removing extra spaces and converting into lower
dim_match_summary['team1'] =dim_match_summary['team1'].astype(str).str.strip().str.lower()
dim_match_summary['team2'] =dim_match_summary['team2'].astype(str).str.strip().str.lower()
dim_match_summary['winner'] =dim_match_summary['winner'].astype(str).str.strip().str.lower()
dim_match_summary['ground'] =dim_match_summary['ground'].astype(str).str.strip().str.lower()
# Converting teamInnings and batsmanName to Proper case
dim_match_summary['team1'] =dim_match_summary['team1'].str.title()
dim_match_summary['team2'] =dim_match_summary['team2'].str.title()
dim_match_summary['winner'] =dim_match_summary['winner'].str.title()
dim_match_summary['ground'] =dim_match_summary['ground'].str.title()
# Set margin to null for no result matches
dim_match_summary.loc[dim_match_summary['winner'] == 'no result', 'margin'] = None
#Checking consistency
same_team_check = dim_match_summary[dim_match_summary['team1'] == dim_match_summary['team2']]
# Display rows where team1 and team2 are the same
if same_team_check.empty:
print("All rows have different teams for 'team1' and 'team2'.")
else:
print("There are rows where 'team1' and 'team2' are the same:")
print(same_team_check)
print(dim_match_summary.duplicated().sum())
dim_match_summary.drop(columns='match_id',inplace=True)
dim_match_summary.to_csv('match_summary.csv', index=False)
dim_match_summary
All rows have different teams for 'team1' and 'team2'. 0
Out[3]:
| team1 | team2 | winner | margin | ground | matchDate | matchId | |
|---|---|---|---|---|---|---|---|
| 0 | Namibia | Sri Lanka | Namibia | 55 runs | Geelong | 2022-10-16 | 1823 |
| 1 | Netherlands | U.A.E. | Netherlands | 3 wickets | Geelong | 2022-10-16 | 1825 |
| 2 | Scotland | West Indies | Scotland | 42 runs | Hobart | 2022-10-17 | 1826 |
| 3 | Ireland | Zimbabwe | Zimbabwe | 31 runs | Hobart | 2022-10-17 | 1828 |
| 4 | Namibia | Netherlands | Netherlands | 5 wickets | Geelong | 2022-10-18 | 1830 |
| 5 | Sri Lanka | U.A.E. | Sri Lanka | 79 runs | Geelong | 2022-10-18 | 1832 |
| 6 | Ireland | Scotland | Ireland | 6 wickets | Hobart | 2022-10-19 | 1833 |
| 7 | West Indies | Zimbabwe | West Indies | 31 runs | Hobart | 2022-10-19 | 1834 |
| 8 | Netherlands | Sri Lanka | Sri Lanka | 16 runs | Geelong | 2022-10-20 | 1835 |
| 9 | Namibia | U.A.E. | U.A.E. | 7 runs | Geelong | 2022-10-20 | 1836 |
| 10 | Ireland | West Indies | Ireland | 9 wickets | Hobart | 2022-10-21 | 1837 |
| 11 | Scotland | Zimbabwe | Zimbabwe | 5 wickets | Hobart | 2022-10-21 | 1838 |
| 12 | Australia | New Zealand | New Zealand | 89 runs | Sydney | 2022-10-22 | 1839 |
| 13 | Afghanistan | England | England | 5 wickets | Perth | 2022-10-22 | 1840 |
| 14 | Ireland | Sri Lanka | Sri Lanka | 9 wickets | Hobart | 2022-10-23 | 1841 |
| 15 | India | Pakistan | India | 4 wickets | Melbourne | 2022-10-23 | 1842 |
| 16 | Bangladesh | Netherlands | Bangladesh | 9 runs | Hobart | 2022-10-24 | 1843 |
| 17 | South Africa | Zimbabwe | No Result | NaN | Hobart | 2022-10-24 | 1844 |
| 18 | Australia | Sri Lanka | Australia | 7 wickets | Perth | 2022-10-25 | 1845 |
| 19 | England | Ireland | Ireland | 5 runs | Melbourne | 2022-10-26 | 1846 |
| 20 | Afghanistan | New Zealand | No Result | NaN | Melbourne | 2022-10-26 | 1846a |
| 21 | Bangladesh | South Africa | South Africa | 104 runs | Sydney | 2022-10-27 | 1847 |
| 22 | India | Netherlands | India | 56 runs | Sydney | 2022-10-27 | 1848 |
| 23 | Pakistan | Zimbabwe | Zimbabwe | 1 run | Perth | 2022-10-27 | 1849 |
| 24 | Afghanistan | Ireland | No Result | NaN | Melbourne | 2022-10-28 | 1849a |
| 25 | Australia | England | No Result | NaN | Melbourne | 2022-10-28 | 1849b |
| 26 | New Zealand | Sri Lanka | New Zealand | 65 runs | Sydney | 2022-10-29 | 1850 |
| 27 | Bangladesh | Zimbabwe | Bangladesh | 3 runs | Brisbane | 2022-10-30 | 1851 |
| 28 | Netherlands | Pakistan | Pakistan | 6 wickets | Perth | 2022-10-30 | 1852 |
| 29 | India | South Africa | South Africa | 5 wickets | Perth | 2022-10-30 | 1853 |
| 30 | Australia | Ireland | Australia | 42 runs | Brisbane | 2022-10-31 | 1855 |
| 31 | Afghanistan | Sri Lanka | Sri Lanka | 6 wickets | Brisbane | 2022-11-01 | 1856 |
| 32 | England | New Zealand | England | 20 runs | Brisbane | 2022-11-01 | 1858 |
| 33 | Netherlands | Zimbabwe | Netherlands | 5 wickets | Adelaide | 2022-11-02 | 1859 |
| 34 | Bangladesh | India | India | 5 runs | Adelaide | 2022-11-02 | 1860 |
| 35 | Pakistan | South Africa | Pakistan | 33 runs | Sydney | 2022-11-03 | 1861 |
| 36 | Ireland | New Zealand | New Zealand | 35 runs | Adelaide | 2022-11-04 | 1862 |
| 37 | Australia | Afghanistan | Australia | 4 runs | Adelaide | 2022-11-04 | 1864 |
| 38 | England | Sri Lanka | England | 4 wickets | Sydney | 2022-11-05 | 1867 |
| 39 | Netherlands | South Africa | Netherlands | 13 runs | Adelaide | 2022-11-06 | 1871 |
| 40 | Bangladesh | Pakistan | Pakistan | 5 wickets | Adelaide | 2022-11-06 | 1872 |
| 41 | India | Zimbabwe | India | 71 runs | Melbourne | 2022-11-06 | 1873 |
| 42 | New Zealand | Pakistan | Pakistan | 7 wickets | Sydney | 2022-11-09 | 1877 |
| 43 | England | India | England | 10 wickets | Adelaide | 2022-11-10 | 1878 |
| 44 | England | Pakistan | England | 5 wickets | Melbourne | 2022-11-13 | 1879 |
In [4]:
dim_players = pd.read_csv('dim_players.csv')
print(dim_players.info())
dim_players.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 219 entries, 0 to 218 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 219 non-null object 1 team 219 non-null object 2 battingStyle 219 non-null object 3 bowlingStyle 199 non-null object 4 playingRole 219 non-null object dtypes: object(5) memory usage: 8.7+ KB None
Out[4]:
| name | team | battingStyle | bowlingStyle | playingRole | |
|---|---|---|---|---|---|
| 0 | Najmul Hossain Shanto | Bangladesh | Left hand Bat | Right arm Offbreak | Top order Batter |
| 1 | Soumya Sarkar | Bangladesh | Left hand Bat | Right arm Medium fast | Middle order Batter |
| 2 | Litton Das | Bangladesh | Right hand Bat | NaN | Wicketkeeper Batter |
| 3 | Shakib Al Hasan(c) | Bangladesh | Left hand Bat | Slow Left arm Orthodox | Allrounder |
| 4 | Afif Hossain | Bangladesh | Left hand Bat | Right arm Offbreak | Allrounder |
In [5]:
import pandas as pd
dim_players = pd.read_csv('dim_players.csv')
# Filling NaN values with specified defaults
dim_players['bowlingStyle'].fillna('None', inplace=True)
dim_players['battingStyle'].fillna('None', inplace=True)
dim_players['playingRole'].fillna('Unknown', inplace=True)
# Converting teamInnings and batsmanName to Proper case
dim_players['name'] = dim_players['name'].astype(str).str.strip().str.lower().str.title()
dim_players['playingRole'] = dim_players['playingRole'].str.strip().str.lower().str.title()
dim_players['team'] = dim_players['team'].str.strip().str.lower().str.title()
dim_players['battingStyle'] = dim_players['battingStyle'].str.strip().str.lower().str.title()
dim_players['bowlingStyle'] = dim_players['bowlingStyle'].str.strip().str.lower().str.title()
# Checking for duplicates in the DataFrame
print(f"Number of duplicate rows: {dim_players.duplicated().sum()}")
dim_players.to_csv('Infoplayers.csv', index=False)
dim_players.head(30)
Number of duplicate rows: 0
Out[5]:
| name | team | battingStyle | bowlingStyle | playingRole | |
|---|---|---|---|---|---|
| 0 | Najmul Hossain Shanto | Bangladesh | Left Hand Bat | Right Arm Offbreak | Top Order Batter |
| 1 | Soumya Sarkar | Bangladesh | Left Hand Bat | Right Arm Medium Fast | Middle Order Batter |
| 2 | Litton Das | Bangladesh | Right Hand Bat | None | Wicketkeeper Batter |
| 3 | Shakib Al Hasan(C) | Bangladesh | Left Hand Bat | Slow Left Arm Orthodox | Allrounder |
| 4 | Afif Hossain | Bangladesh | Left Hand Bat | Right Arm Offbreak | Allrounder |
| 5 | Mosaddek Hossain | Bangladesh | Right Hand Bat | Right Arm Offbreak | Middle Order Batter |
| 6 | Nurul Hasan | Bangladesh | Right Hand Bat | None | Wicketkeeper Batter |
| 7 | Yasir Ali | Bangladesh | Right Hand Bat | Right Arm Offbreak | Middle Order Batter |
| 8 | Wessly Madhevere | Zimbabwe | Right Hand Bat | Right Arm Offbreak | Allrounder |
| 9 | Craig Ervine(C) | Zimbabwe | Left Hand Bat | Right Arm Offbreak | Middle Order Batter |
| 10 | Milton Shumba | Zimbabwe | Left Hand Bat | Slow Left Arm Orthodox | Top Order Batter |
| 11 | Sean Williams | Zimbabwe | Left Hand Bat | Slow Left Arm Orthodox | Middle Order Batter |
| 12 | Sikandar Raza | Zimbabwe | Right Hand Bat | Right Arm Offbreak | Batting Allrounder |
| 13 | Regis Chakabva | Zimbabwe | Right Hand Bat | Right Arm Offbreak | Wicketkeeper Batter |
| 14 | Ryan Burl | Zimbabwe | Left Hand Bat | Legbreak | Middle Order Batter |
| 15 | Brad Evans | Zimbabwe | Right Hand Bat | Right Arm Fast | Allrounder |
| 16 | Richard Ngarava | Zimbabwe | Left Hand Bat | Left Arm Fast Medium | Bowler |
| 17 | Blessing Muzarabani | Zimbabwe | Right Hand Bat | Right Arm Fast Medium | Bowler |
| 18 | Tendai Chatara | Zimbabwe | Right Hand Bat | Right Arm Fast Medium | Bowler |
| 19 | Taskin Ahmed | Bangladesh | Left Hand Bat | Right Arm Fast | Bowler |
| 20 | Hasan Mahmud | Bangladesh | Right Hand Bat | Right Arm Medium | Bowler |
| 21 | Mustafizur Rahman | Bangladesh | Left Hand Bat | Left Arm Fast Medium | Bowler |
| 22 | Shakib Al Hasan | Bangladesh | Left Hand Bat | Slow Left Arm Orthodox | Allrounder |
| 23 | Nasum Ahmed | Bangladesh | Left Hand Bat | Slow Left Arm Orthodox | Bowler |
| 24 | Mohammad Rizwan | Pakistan | Right Hand Bat | None | Wicketkeeper Batter |
| 25 | Babar Azam(C) | Pakistan | Right Hand Bat | Right Arm Offbreak | Batter |
| 26 | Mohammad Nawaz | Pakistan | Left Hand Bat | Slow Left Arm Orthodox | Allrounder |
| 27 | Mohammad Haris | Pakistan | Right Hand Bat | Right Arm Offbreak | Middle Order Batter |
| 28 | Shan Masood | Pakistan | Left Hand Bat | Right Arm Medium Fast | Opening Batter |
| 29 | Iftikhar Ahmed | Pakistan | Right Hand Bat | Right Arm Offbreak | Middle Order Batter |
In [6]:
fact_bowling_summary = pd.read_csv('fact_bowling_summary.csv')
print(fact_bowling_summary.info())
fact_bowling_summary.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 500 entries, 0 to 499 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 match 500 non-null object 1 bowlingTeam 500 non-null object 2 bowlerName 500 non-null object 3 overs 500 non-null float64 4 maiden 500 non-null int64 5 runs 500 non-null int64 6 wickets 500 non-null int64 7 economy 500 non-null float64 8 0s 500 non-null int64 9 4s 500 non-null int64 10 6s 500 non-null int64 11 wides 500 non-null int64 12 noBalls 500 non-null int64 13 match_id 500 non-null object dtypes: float64(2), int64(8), object(4) memory usage: 54.8+ KB None
Out[6]:
| match | bowlingTeam | bowlerName | overs | maiden | runs | wickets | economy | 0s | 4s | 6s | wides | noBalls | match_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Namibia Vs Sri Lanka | Sri Lanka | Maheesh Theekshana | 4.0 | 0 | 23 | 1 | 5.75 | 7 | 0 | 0 | 2 | 0 | T20I # 1823 |
| 1 | Namibia Vs Sri Lanka | Sri Lanka | Dushmantha Chameera | 4.0 | 0 | 39 | 1 | 9.75 | 6 | 3 | 1 | 2 | 0 | T20I # 1823 |
| 2 | Namibia Vs Sri Lanka | Sri Lanka | Pramod Madushan | 4.0 | 0 | 37 | 2 | 9.25 | 6 | 3 | 1 | 0 | 0 | T20I # 1823 |
| 3 | Namibia Vs Sri Lanka | Sri Lanka | Chamika Karunaratne | 4.0 | 0 | 36 | 1 | 9.00 | 7 | 3 | 1 | 1 | 0 | T20I # 1823 |
| 4 | Namibia Vs Sri Lanka | Sri Lanka | Wanindu Hasaranga de Silva | 4.0 | 0 | 27 | 1 | 6.75 | 8 | 1 | 1 | 0 | 0 | T20I # 1823 |
In [7]:
#Removing T20I# from the matchId
fact_bowling_summary['matchId'] = fact_bowling_summary['match_id'].str.extract(r'#\s(\d+[a-zA-Z]*)')
#Calculating Overall Extras Conceded
fact_bowling_summary['extras'] = fact_bowling_summary['wides'] + fact_bowling_summary['noBalls']
#Removing extra spaces and converting into lower
fact_bowling_summary['bowlingTeam'] = fact_bowling_summary['bowlingTeam'].astype(str).str.strip().str.lower()
fact_bowling_summary['bowlerName'] = fact_bowling_summary['bowlerName'].astype(str).str.strip().str.lower()
#Converting teamInnings and batsmanName to Proper case
fact_bowling_summary['bowlingTeam'] = fact_bowling_summary['bowlingTeam'].str.title()
fact_bowling_summary['bowlerName'] = fact_bowling_summary['bowlerName'].str.title()
#Detecting outliers in the economy
outliers = fact_bowling_summary[
(fact_bowling_summary['economy'] > 36) |
(fact_bowling_summary['overs'] > 4.0) |
(fact_bowling_summary['wickets'] > 10) |
(fact_bowling_summary['maiden'] > 4)
]
if outliers.empty:
print("No Oultiers")
else:
print("Outliers Detected:\n", outliers)
q="""SELECT bowlerName,runs,economy,
(runs / economy) AS actual_overs_bowled
FROM
outliers;
"""
actual_overs=mysql(q)
actual_overs
Outliers Detected:
match bowlingTeam bowlerName overs maiden runs \
12 U.A.E. Vs Netherlands Netherlands Tim Pringle 5.0 0 13
wickets economy 0s 4s 6s wides noBalls match_id matchId \
12 1 3.25 11 0 0 0 0 T20I # 1825 1825
extras
12 0
Out[7]:
| bowlerName | runs | economy | actual_overs_bowled | |
|---|---|---|---|---|
| 0 | Tim Pringle | 13 | 3.25 | 4.0 |
In [8]:
fact_bowling_summary.loc[
(fact_bowling_summary['matchId'] == '1825') &
(fact_bowling_summary['bowlerName'] == 'Tim Pringle'),
'overs'
] =actual_overs['actual_overs_bowled'][0]
updated_row = fact_bowling_summary[
(fact_bowling_summary['matchId'] == '1825') &
(fact_bowling_summary['bowlerName'] == 'Tim Pringle')
]
updated_row
Out[8]:
| match | bowlingTeam | bowlerName | overs | maiden | runs | wickets | economy | 0s | 4s | 6s | wides | noBalls | match_id | matchId | extras | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12 | U.A.E. Vs Netherlands | Netherlands | Tim Pringle | 4.0 | 0 | 13 | 1 | 3.25 | 11 | 0 | 0 | 0 | 0 | T20I # 1825 | 1825 | 0 |
In [9]:
#Calculating Number of balls bowled
fact_bowling_summary['total_balls_bowled'] = (fact_bowling_summary['overs'].astype(int) * 6) + ((fact_bowling_summary['overs'] % 1) * 10).astype(int)
print(f"Number of duplicate rows: {fact_bowling_summary.duplicated().sum()}")
fact_bowling_summary.drop(columns='match_id',inplace=True)
fact_bowling_summary.to_csv('bowling_summary.csv', index=False)
fact_bowling_summary
Number of duplicate rows: 0
Out[9]:
| match | bowlingTeam | bowlerName | overs | maiden | runs | wickets | economy | 0s | 4s | 6s | wides | noBalls | matchId | extras | total_balls_bowled | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Namibia Vs Sri Lanka | Sri Lanka | Maheesh Theekshana | 4.0 | 0 | 23 | 1 | 5.75 | 7 | 0 | 0 | 2 | 0 | 1823 | 2 | 24 |
| 1 | Namibia Vs Sri Lanka | Sri Lanka | Dushmantha Chameera | 4.0 | 0 | 39 | 1 | 9.75 | 6 | 3 | 1 | 2 | 0 | 1823 | 2 | 24 |
| 2 | Namibia Vs Sri Lanka | Sri Lanka | Pramod Madushan | 4.0 | 0 | 37 | 2 | 9.25 | 6 | 3 | 1 | 0 | 0 | 1823 | 0 | 24 |
| 3 | Namibia Vs Sri Lanka | Sri Lanka | Chamika Karunaratne | 4.0 | 0 | 36 | 1 | 9.00 | 7 | 3 | 1 | 1 | 0 | 1823 | 1 | 24 |
| 4 | Namibia Vs Sri Lanka | Sri Lanka | Wanindu Hasaranga De Silva | 4.0 | 0 | 27 | 1 | 6.75 | 8 | 1 | 1 | 0 | 0 | 1823 | 0 | 24 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 495 | Pakistan Vs England | Pakistan | Naseem Shah | 4.0 | 0 | 30 | 0 | 7.50 | 15 | 3 | 1 | 1 | 0 | 1879 | 1 | 24 |
| 496 | Pakistan Vs England | Pakistan | Haris Rauf | 4.0 | 0 | 23 | 2 | 5.75 | 13 | 3 | 0 | 1 | 0 | 1879 | 1 | 24 |
| 497 | Pakistan Vs England | Pakistan | Shadab Khan | 4.0 | 0 | 20 | 1 | 5.00 | 10 | 1 | 0 | 0 | 0 | 1879 | 0 | 24 |
| 498 | Pakistan Vs England | Pakistan | Mohammad Wasim | 4.0 | 0 | 38 | 1 | 9.50 | 5 | 5 | 0 | 2 | 0 | 1879 | 2 | 24 |
| 499 | Pakistan Vs England | Pakistan | Iftikhar Ahmed | 0.5 | 0 | 13 | 0 | 15.60 | 0 | 1 | 1 | 0 | 0 | 1879 | 0 | 5 |
500 rows × 16 columns
In [10]:
fact_batting_summary = pd.read_csv('fact_bating_summary.csv')
print(fact_batting_summary.info())
fact_batting_summary.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 699 entries, 0 to 698 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 match 699 non-null object 1 teamInnings 699 non-null object 2 battingPos 699 non-null int64 3 batsmanName 699 non-null object 4 runs 699 non-null int64 5 balls 699 non-null int64 6 4s 699 non-null int64 7 6s 699 non-null int64 8 SR 699 non-null object 9 out/not_out 699 non-null object 10 match_id 699 non-null object dtypes: int64(5), object(6) memory usage: 60.2+ KB None
Out[10]:
| match | teamInnings | battingPos | batsmanName | runs | balls | 4s | 6s | SR | out/not_out | match_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Namibia Vs Sri Lanka | Namibia | 1 | Michael van Lingen | 3 | 6 | 0 | 0 | 50 | out | T20I # 1823 |
| 1 | Namibia Vs Sri Lanka | Namibia | 2 | Divan la Cock | 9 | 9 | 1 | 0 | 100 | out | T20I # 1823 |
| 2 | Namibia Vs Sri Lanka | Namibia | 3 | Jan Nicol Loftie-Eaton | 20 | 12 | 1 | 2 | 166.66 | out | T20I # 1823 |
| 3 | Namibia Vs Sri Lanka | Namibia | 4 | Stephan Baard | 26 | 24 | 2 | 0 | 108.33 | out | T20I # 1823 |
| 4 | Namibia Vs Sri Lanka | Namibia | 5 | Gerhard Erasmus(c) | 20 | 24 | 0 | 0 | 83.33 | out | T20I # 1823 |
In [11]:
#Removing T20I# from the matchId
fact_batting_summary['matchId'] = fact_batting_summary['match_id'].str.extract(r'#\s(\d+[a-zA-Z]*)')
#Converting SR to Float Type
fact_batting_summary['SR'] = fact_batting_summary['SR'].replace({'-': np.nan}).fillna(0.00).astype(float).round(2)
#Removing extra spaces and converting into lower
fact_batting_summary['teamInnings'] = fact_batting_summary['teamInnings'].astype(str).str.strip().str.lower()
fact_batting_summary['batsmanName'] = fact_batting_summary['batsmanName'].astype(str).str.strip().str.lower()
# Converting teamInnings and batsmanName to Proper case
fact_batting_summary['teamInnings'] = fact_batting_summary['teamInnings'].str.title()
fact_batting_summary['batsmanName'] = fact_batting_summary['batsmanName'].str.title()
# Converting out/not_out to binary
print(fact_batting_summary['out/not_out'].unique())
fact_batting_summary['out/not_out'] = fact_batting_summary['out/not_out'].astype(str).str.strip().str.lower()
fact_batting_summary['out/not_out'] = fact_batting_summary['out/not_out'].map({'out': 1, 'not_out': 0})
# Checking Outliers
normal_outliers = fact_batting_summary[
(fact_batting_summary['runs'] < 0) |
(fact_batting_summary['balls'] < 0) |
(fact_batting_summary['4s'] < 0) |(fact_batting_summary['4s'] > fact_batting_summary['balls']) |
(fact_batting_summary['6s'] < 0) |(fact_batting_summary['6s'] > fact_batting_summary['balls']) |
(fact_batting_summary['SR'] < 0)
]
if normal_outliers.empty:
print("No Normal Oultiers")
else:
print("Outliers Detected:\n",normal_outliers)
#Checking SR outliers
fact_batting_summary['calculated_SR'] = np.where(
fact_batting_summary['balls'] > 0,
(fact_batting_summary['runs'] / fact_batting_summary['balls']) * 100,
0
)
strike_rate_outlier = fact_batting_summary[
(fact_batting_summary['balls'] > 0) &
(abs(fact_batting_summary['SR'] - fact_batting_summary['calculated_SR']) > 1)
]
print(strike_rate_outlier)
q="""SELECT *,
ROUND((SR * balls) / 100) AS calculated_runs
FROM strike_rate_outlier;
"""
SR_outlier=mysql(q)
SR_outlier
['out' 'not_out' 'Out' 'not_Out' 'out ' 'not_out ']
No Normal Oultiers
match teamInnings battingPos batsmanName runs \
263 Pakistan Vs India India 5 Axar Patel 62
407 Bangladesh Vs Zimbabwe Zimbabwe 1 Wessly Madhevere 74
balls 4s 6s SR out/not_out match_id matchId calculated_SR
263 3 0 0 66.66 1 T20I # 1842 1842 2066.666667
407 3 1 0 133.33 1 T20I # 1851 1851 2466.666667
Out[11]:
| match | teamInnings | battingPos | batsmanName | runs | balls | 4s | 6s | SR | out/not_out | match_id | matchId | calculated_SR | calculated_runs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Pakistan Vs India | India | 5 | Axar Patel | 62 | 3 | 0 | 0 | 66.66 | 1 | T20I # 1842 | 1842 | 2066.666667 | 2.0 |
| 1 | Bangladesh Vs Zimbabwe | Zimbabwe | 1 | Wessly Madhevere | 74 | 3 | 1 | 0 | 133.33 | 1 | T20I # 1851 | 1851 | 2466.666667 | 4.0 |
In [12]:
fact_batting_summary.drop(columns=['match_id','calculated_SR'],inplace=True)
for index, row in SR_outlier.iterrows():
fact_batting_summary.loc[
(fact_batting_summary['matchId'] == row['matchId']) &
(fact_batting_summary['teamInnings'] == row['teamInnings']) &
(fact_batting_summary['battingPos'] == row['battingPos']) &
(fact_batting_summary['batsmanName'] == row['batsmanName']),
'runs'
] = row['calculated_runs']
# Verify the updated rows
updated_rows = fact_batting_summary[
(fact_batting_summary['matchId'].isin(SR_outlier['matchId'])) &
(fact_batting_summary['batsmanName'].isin(SR_outlier['batsmanName']))
]
updated_rows
Out[12]:
| match | teamInnings | battingPos | batsmanName | runs | balls | 4s | 6s | SR | out/not_out | matchId | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 263 | Pakistan Vs India | India | 5 | Axar Patel | 2 | 3 | 0 | 0 | 66.66 | 1 | 1842 |
| 407 | Bangladesh Vs Zimbabwe | Zimbabwe | 1 | Wessly Madhevere | 4 | 3 | 1 | 0 | 133.33 | 1 | 1851 |
In [13]:
fact_batting_summary.to_csv('batting_summary.csv', index=False)
fact_batting_summary
Out[13]:
| match | teamInnings | battingPos | batsmanName | runs | balls | 4s | 6s | SR | out/not_out | matchId | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Namibia Vs Sri Lanka | Namibia | 1 | Michael Van Lingen | 3 | 6 | 0 | 0 | 50.00 | 1 | 1823 |
| 1 | Namibia Vs Sri Lanka | Namibia | 2 | Divan La Cock | 9 | 9 | 1 | 0 | 100.00 | 1 | 1823 |
| 2 | Namibia Vs Sri Lanka | Namibia | 3 | Jan Nicol Loftie-Eaton | 20 | 12 | 1 | 2 | 166.66 | 1 | 1823 |
| 3 | Namibia Vs Sri Lanka | Namibia | 4 | Stephan Baard | 26 | 24 | 2 | 0 | 108.33 | 1 | 1823 |
| 4 | Namibia Vs Sri Lanka | Namibia | 5 | Gerhard Erasmus(C) | 20 | 24 | 0 | 0 | 83.33 | 1 | 1823 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 694 | Pakistan Vs England | England | 3 | Phil Salt | 10 | 9 | 2 | 0 | 111.11 | 1 | 1879 |
| 695 | Pakistan Vs England | England | 4 | Ben Stokes | 52 | 49 | 5 | 1 | 106.12 | 0 | 1879 |
| 696 | Pakistan Vs England | England | 5 | Harry Brook | 20 | 23 | 1 | 0 | 86.95 | 1 | 1879 |
| 697 | Pakistan Vs England | England | 6 | Moeen Ali | 19 | 13 | 3 | 0 | 146.15 | 1 | 1879 |
| 698 | Pakistan Vs England | England | 7 | Liam Livingstone | 1 | 1 | 0 | 0 | 100.00 | 0 | 1879 |
699 rows × 11 columns
In [14]:
print(fact_batting_summary['out/not_out'].unique())
[1 0]
KPI's Identification¶
In [15]:
from sqlalchemy import create_engine
from urllib.parse import quote
password = quote('Ullekh@123')
DATABASE_URL = f'mysql+pymysql://root:{password}@127.0.0.1/T20WorldCup'
engine = create_engine(DATABASE_URL)
matchdf = pd.read_sql_table('match_summary', engine)
matchdf['margin'] =matchdf['margin'].astype(str).str.strip().str.lower()
matchdf['margin'] =matchdf['margin'].str.title()
matchdf.head()
Out[15]:
| team1 | team2 | winner | margin | ground | matchDate | matchId | |
|---|---|---|---|---|---|---|---|
| 0 | Namibia | Sri Lanka | Namibia | 55 Runs | Geelong | 2022-10-16 | 1823 |
| 1 | Netherlands | U.A.E. | Netherlands | 3 Wickets | Geelong | 2022-10-16 | 1825 |
| 2 | Scotland | West Indies | Scotland | 42 Runs | Hobart | 2022-10-17 | 1826 |
| 3 | Ireland | Zimbabwe | Zimbabwe | 31 Runs | Hobart | 2022-10-17 | 1828 |
| 4 | Namibia | Netherlands | Netherlands | 5 Wickets | Geelong | 2022-10-18 | 1830 |
In [16]:
battingdf=pd.read_sql_table('batting_summary',engine)
battingdf.head()
Out[16]:
| match | teamInnings | battingPos | batsmanName | runs | balls | 4s | 6s | SR | out/not_out | matchId | playerId | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Namibia Vs Sri Lanka | Namibia | 1 | Michael Van Lingen | 3 | 6 | 0 | 0 | 50.00 | 1 | 1823 | 128 |
| 1 | Namibia Vs Sri Lanka | Namibia | 2 | Divan La Cock | 9 | 9 | 1 | 0 | 100.00 | 1 | 1823 | 129 |
| 2 | Namibia Vs Sri Lanka | Namibia | 3 | Jan Nicol Loftie-Eaton | 20 | 12 | 1 | 2 | 166.66 | 1 | 1823 | 131 |
| 3 | Namibia Vs Sri Lanka | Namibia | 4 | Stephan Baard | 26 | 24 | 2 | 0 | 108.33 | 1 | 1823 | 130 |
| 4 | Namibia Vs Sri Lanka | Namibia | 5 | Gerhard Erasmus | 20 | 24 | 0 | 0 | 83.33 | 1 | 1823 | 133 |
In [17]:
bowlingdf=pd.read_sql_table('bowling_summary',engine)
bowlingdf.head()
Out[17]:
| match | bowlingTeam | bowlerName | overs | maiden | runs | wickets | economy | 0s | 4s | 6s | wides | noBalls | matchId | extras | total_balls_bowled | playerId | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Namibia Vs Sri Lanka | Sri Lanka | Maheesh Theekshana | 4.0 | 0 | 23 | 1 | 5.75 | 7 | 0 | 0 | 2 | 0 | 1823 | 2 | 24 | 73 |
| 1 | Namibia Vs Sri Lanka | Sri Lanka | Dushmantha Chameera | 4.0 | 0 | 39 | 1 | 9.75 | 6 | 3 | 1 | 2 | 0 | 1823 | 2 | 24 | 192 |
| 2 | Namibia Vs Sri Lanka | Sri Lanka | Pramod Madushan | 4.0 | 0 | 37 | 2 | 9.25 | 6 | 3 | 1 | 0 | 0 | 1823 | 0 | 24 | 71 |
| 3 | Namibia Vs Sri Lanka | Sri Lanka | Chamika Karunaratne | 4.0 | 0 | 36 | 1 | 9.00 | 7 | 3 | 1 | 1 | 0 | 1823 | 1 | 24 | 164 |
| 4 | Namibia Vs Sri Lanka | Sri Lanka | Wanindu Hasaranga De Silva | 4.0 | 0 | 27 | 1 | 6.75 | 8 | 1 | 1 | 0 | 0 | 1823 | 0 | 24 | 74 |
In [18]:
playersdf=pd.read_sql_table('infoplayers',engine)
playersdf.head()
Out[18]:
| name | team | battingStyle | bowlingStyle | playingRole | playerId | |
|---|---|---|---|---|---|---|
| 0 | Najmul Hossain Shanto | Bangladesh | Left Hand Bat | Right Arm Offbreak | Top Order Batter | 1 |
| 1 | Soumya Sarkar | Bangladesh | Left Hand Bat | Right Arm Medium Fast | Middle Order Batter | 2 |
| 2 | Litton Das | Bangladesh | Right Hand Bat | None | Wicketkeeper Batter | 3 |
| 3 | Shakib Al Hasan | Bangladesh | Left Hand Bat | Slow Left Arm Orthodox | Allrounder | 4 |
| 4 | Afif Hossain | Bangladesh | Left Hand Bat | Right Arm Offbreak | Allrounder | 5 |
Overall Tournament KPIs¶
In [19]:
import pandas as pd
queries = {
"Total Matches": """
SELECT COUNT(DISTINCT matchId) as Total_Matches
FROM matchdf;
""",
"Total Runs Scored": """
SELECT SUM(runs) AS total_runs_scored
FROM battingdf;
""",
"Total Wickets Taken": """
SELECT SUM(wickets) AS total_wickets_taken
FROM bowlingdf;
""",
"Total Boundaries": """
SELECT SUM("4s") + SUM("6s") AS Total_boundaries
FROM battingdf;
""",
"Average Runs Per Innings": """
SELECT ROUND(AVG(total_runs), 2) AS avg_runs_per_innings
FROM (
SELECT matchId, teamInnings, SUM(runs) AS total_runs
FROM battingdf
GROUP BY matchId, teamInnings
) AS innings_runs;
""",
"Highest Run Scorer": """
SELECT batsmanName,
SUM(runs) AS total_runs
FROM battingdf
GROUP BY batsmanName
ORDER BY total_runs DESC
LIMIT 1;
""",
"Highest Wicket Taker": """
SELECT bowlerName,
SUM(wickets) AS total_wickets
FROM bowlingdf
GROUP BY bowlerName
ORDER BY total_wickets DESC
LIMIT 1;
""",
"Highest Individual Score": """
SELECT batsmanName, MAX(runs) AS highest_individual_score, matchId
FROM battingdf
GROUP BY batsmanName, matchId
ORDER BY highest_individual_score DESC
LIMIT 1;
""",
"Highest Individual Wickets": """
SELECT bowlerName, MAX(wickets) AS highest_individual_wickets, runs AS runs_given, matchId
FROM bowlingdf
GROUP BY bowlerName, runs, matchId
ORDER BY highest_individual_wickets DESC
LIMIT 1;
"""
}
results = {}
for kpi_name, query in queries.items():
df = mysql(query)
if kpi_name in ["Highest Run Scorer", "Highest Wicket Taker", "Highest Individual Score", "Highest Individual Wickets"]:
if kpi_name == "Highest Run Scorer":
results[kpi_name] = f"{df.iloc[0]['batsmanName']} ({df.iloc[0]['total_runs']})"
elif kpi_name == "Highest Wicket Taker":
results[kpi_name] = f"{df.iloc[0]['bowlerName']} ({df.iloc[0]['total_wickets']})"
elif kpi_name == "Highest Individual Score":
results[kpi_name] = f"{df.iloc[0]['batsmanName']} ({df.iloc[0]['highest_individual_score']})"
elif kpi_name == "Highest Individual Wickets":
results[kpi_name] = f"{df.iloc[0]['bowlerName']} ({df.iloc[0]['highest_individual_wickets']})"
else:
results[kpi_name] = df.iloc[0, 0]
kpi_df = pd.DataFrame(list(results.items()), columns=["KPI", "Value"])
kpi_df
Out[19]:
| KPI | Value | |
|---|---|---|
| 0 | Total Matches | 45 |
| 1 | Total Runs Scored | 11169 |
| 2 | Total Wickets Taken | 515 |
| 3 | Total Boundaries | 1240 |
| 4 | Average Runs Per Innings | 132.96 |
| 5 | Highest Run Scorer | Virat Kohli (296) |
| 6 | Highest Wicket Taker | Wanindu Hasaranga De Silva (15) |
| 7 | Highest Individual Score | Rilee Rossouw (109) |
| 8 | Highest Individual Wickets | Sam Curran (5) |
Team KPI's¶
In [20]:
import pandas as pd
team_name = 'India'
queries = {
"Total Matches": f"""
SELECT COUNT(DISTINCT matchId) AS Total_Matches
FROM matchdf
WHERE team1 = '{team_name}' OR team2='{team_name}';
""",
"Win/Loss Ratio": f"""
SELECT
ROUND(SUM(wins) * 1.0 / SUM(total_matches) * 100, 2) AS win_loss_ratio
FROM (
SELECT
SUM(CASE WHEN winner = team1 THEN 1 ELSE 0 END) AS wins,
COUNT(*) AS total_matches
FROM matchdf
WHERE team1 = '{team_name}'
UNION ALL
SELECT
SUM(CASE WHEN winner = team2 THEN 1 ELSE 0 END) AS wins,
COUNT(*) AS total_matches
FROM matchdf
WHERE team2 = '{team_name}'
) AS combined
""",
"Total Runs Scored": f"""
SELECT SUM(runs) AS total_runs_scored
FROM battingdf
WHERE teamInnings = '{team_name}';
""",
"Total Wickets Taken": f"""
SELECT SUM(wickets) AS total_wickets_taken
FROM bowlingdf
WHERE bowlingTeam = '{team_name}';
""",
"Total Boundaries": f"""
SELECT SUM("4s") + SUM("6s") AS Total_boundaries
FROM battingdf
WHERE teamInnings = '{team_name}';
""",
"AVG Runs Per Innings": f"""
SELECT ROUND(AVG(total_runs), 2) AS avg_runs_per_innings
FROM (
SELECT matchId, teamInnings, SUM(runs) AS total_runs
FROM battingdf
WHERE teamInnings = '{team_name}'
GROUP BY matchId, teamInnings
) AS innings_runs;
""",
"AVG Economy Rate": f"""
SELECT ROUND(SUM(runs) * 1.0 / NULLIF(SUM(overs), 0), 2) AS economy_rate
FROM bowlingdf
WHERE bowlingTeam = '{team_name}';
""",
"Highest Run Scorer": f"""
SELECT batsmanName,
SUM(runs) AS total_runs
FROM battingdf
WHERE teamInnings = '{team_name}'
GROUP BY batsmanName
ORDER BY total_runs DESC
LIMIT 1;
""",
"Highest Wicket Taker": f"""
SELECT bowlerName,
SUM(wickets) AS total_wickets
FROM bowlingdf
WHERE bowlingTeam = '{team_name}'
GROUP BY bowlerName
ORDER BY total_wickets DESC
LIMIT 1;
"""
}
results = {}
for kpi_name, query in queries.items():
df = mysql(query)
if kpi_name in ["Highest Run Scorer", "Highest Wicket Taker"]:
if not df.empty:
if kpi_name == "Highest Run Scorer":
results[kpi_name] = f"{df.iloc[0]['batsmanName']} ({df.iloc[0]['total_runs']})"
elif kpi_name == "Highest Wicket Taker":
results[kpi_name] = f"{df.iloc[0]['bowlerName']} ({df.iloc[0]['total_wickets']})"
else:
results[kpi_name] = df.iloc[0, 0] if not df.empty else None
team_kpi = pd.DataFrame(list(results.items()), columns=["KPI", "Value"])
team_kpi
Out[20]:
| KPI | Value | |
|---|---|---|
| 0 | Total Matches | 6 |
| 1 | Win/Loss Ratio | 66.67 |
| 2 | Total Runs Scored | 966 |
| 3 | Total Wickets Taken | 37 |
| 4 | Total Boundaries | 118 |
| 5 | AVG Runs Per Innings | 161.0 |
| 6 | AVG Economy Rate | 7.64 |
| 7 | Highest Run Scorer | Virat Kohli (296) |
| 8 | Highest Wicket Taker | Arshdeep Singh (10) |
Players KPI's¶
In [21]:
def generate_player_kpis(player_name):
queries = {
"Batter": {
"Total Runs Scored": """
SELECT SUM(b.runs) AS total_runs_scored
FROM battingdf b
JOIN playersdf p ON b.batsmanName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Batting Average": """
SELECT CAST(SUM(b.runs) AS FLOAT) / NULLIF(COUNT(DISTINCT CASE WHEN b.`out/not_out` = 1 THEN b.matchId END), 0) AS batting_average
FROM battingdf b
JOIN playersdf p ON b.batsmanName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Average Balls Faced": """
SELECT CAST(SUM(b.balls) AS FLOAT) / NULLIF(COUNT(DISTINCT b.matchId), 0) AS avg_balls_faced
FROM battingdf b
JOIN playersdf p ON b.batsmanName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"AVG StrikeRate":"""
SELECT AVG(SR) AS average_strike_rate
FROM battingdf
WHERE batsmanName='{player_name}'
""",
"Boundary Percentage": """
SELECT (CAST(SUM(b.`4s`) + SUM(b.`6s`) AS FLOAT) / NULLIF(SUM(b.balls), 0)) * 100 AS boundary_percentage
FROM battingdf b
JOIN playersdf p ON b.batsmanName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Boundaries Hit Per Match": """
SELECT (CAST(SUM(b.`4s`) + SUM(b.`6s`) AS FLOAT) / NULLIF(COUNT(DISTINCT b.matchId), 0)) AS boundaries_per_match
FROM battingdf b
JOIN playersdf p ON b.batsmanName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Boundary Frequency (balls per boundary)": """
SELECT (SUM(b.balls) / NULLIF(SUM(b.`4s` + b.`6s`), 0)) AS boundary_frequency
FROM battingdf b
JOIN playersdf p ON b.batsmanName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
"""
},
"Bowler": {
"Total Wickets Taken": """
SELECT SUM(b.wickets) AS total_wickets_taken
FROM bowlingdf b
JOIN playersdf p ON b.bowlerName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Bowling Average": """
SELECT CAST(SUM(b.runs) AS FLOAT) / NULLIF(SUM(b.wickets), 0) AS bowling_average
FROM bowlingdf b
JOIN playersdf p ON b.bowlerName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Economy Rate": """
SELECT CAST(SUM(b.runs) AS FLOAT) / NULLIF(SUM(b.total_balls_bowled) / 6, 0) AS economy_rate
FROM bowlingdf b
JOIN playersdf p ON b.bowlerName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Bowling Strike Rate": """
SELECT CAST(SUM(b.total_balls_bowled) AS FLOAT) / NULLIF(SUM(b.wickets), 0) AS bowling_strike_rate
FROM bowlingdf b
JOIN playersdf p ON b.bowlerName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Dot Balls Bowled Percentage": """
SELECT (CAST(SUM(b.`0s`) AS FLOAT) / CAST(SUM(b.total_balls_bowled) AS FLOAT)) * 100 AS dot_balls_bowled_percentage
FROM bowlingdf b
JOIN playersdf p ON b.bowlerName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
"""
},
"Allrounder": {
"Total Runs Scored": """
SELECT SUM(b.runs) AS total_runs_scored
FROM battingdf b
JOIN playersdf p ON b.batsmanName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Batting Average": """
SELECT CAST(SUM(b.runs) AS FLOAT) / NULLIF(COUNT(DISTINCT CASE WHEN b.`out/not_out` = 1 THEN b.matchId END), 0) AS batting_average
FROM battingdf b
JOIN playersdf p ON b.batsmanName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Average Balls Faced": """
SELECT CAST(SUM(b.balls) AS FLOAT) / NULLIF(COUNT(DISTINCT b.matchId), 0) AS avg_balls_faced
FROM battingdf b
JOIN playersdf p ON b.batsmanName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Boundary Percentage": """
SELECT (CAST(SUM(b.`4s`) + SUM(b.`6s`) AS FLOAT) / NULLIF(SUM(b.balls), 0)) * 100 AS boundary_percentage
FROM battingdf b
JOIN playersdf p ON b.batsmanName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Boundaries Hit Per Match": """
SELECT (CAST(SUM(b.`4s`) + SUM(b.`6s`) AS FLOAT) / NULLIF(COUNT(DISTINCT b.matchId), 0)) AS boundaries_per_match
FROM battingdf b
JOIN playersdf p ON b.batsmanName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Boundary Frequency (balls per boundary)": """
SELECT (SUM(b.balls) / NULLIF(SUM(b.`4s` + b.`6s`), 0)) AS boundary_frequency
FROM battingdf b
JOIN playersdf p ON b.batsmanName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Total Wickets Taken": """
SELECT SUM(b.wickets) AS total_wickets_taken
FROM bowlingdf b
JOIN playersdf p ON b.bowlerName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Bowling Average": """
SELECT CAST(SUM(b.runs) AS FLOAT) / NULLIF(SUM(b.wickets), 0) AS bowling_average
FROM bowlingdf b
JOIN playersdf p ON b.bowlerName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Economy Rate": """
SELECT CAST(SUM(b.runs) AS FLOAT) / NULLIF(SUM(b.total_balls_bowled) / 6, 0) AS economy_rate
FROM bowlingdf b
JOIN playersdf p ON b.bowlerName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Bowling Strike Rate": """
SELECT CAST(SUM(b.total_balls_bowled) AS FLOAT) / NULLIF(SUM(b.wickets), 0) AS bowling_strike_rate
FROM bowlingdf b
JOIN playersdf p ON b.bowlerName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
""",
"Dot Balls Bowled Percentage": """
SELECT (CAST(SUM(b.`0s`) AS FLOAT) / CAST(SUM(b.total_balls_bowled) AS FLOAT)) * 100 AS dot_balls_bowled_percentage
FROM bowlingdf b
JOIN playersdf p ON b.bowlerName = p.name
WHERE p.name = '{player_name}'
GROUP BY p.name;
"""
}
}
role_query = f"""
SELECT playingRole AS playingRole
FROM playersdf
WHERE name='{player_name}'
""".format(player_name=player_name)
role_df = mysql(role_query)
if role_df.empty:
print(f"No role found for player: {player_name}")
return []
role = role_df.iloc[0]['playingRole']
print(f"Role retrieved for {player_name}: {role}")
if role in ['Top Order Batter', 'Middle Order Batter', 'Wicketkeeper Batter', 'Opening Batter', 'Batter']:
kpi_queries = queries["Batter"]
elif role in ['Bowler']:
kpi_queries = queries["Bowler"]
elif role in ['Allrounder', 'Batting Allrounder', 'Bowling Allrounder']:
kpi_queries = queries["Allrounder"]
else:
print(f"Unrecognized role: {role}")
return []
kpi_data = []
for kpi_name, query in kpi_queries.items():
formatted_query = query.format(player_name=player_name)
kpi_df = mysql(formatted_query)
if not kpi_df.empty:
kpi_value = kpi_df.iloc[0].values[0]
kpi_value = round(kpi_value, 2)
kpi_data.append({"KPI": kpi_name, "Value": kpi_value})
else:
print(f"No KPI data found for {kpi_name} for player: {player_name}")
kpi_df = pd.DataFrame(kpi_data)
return kpi_df
player_name = 'Sam Curran'
kpi_df = generate_player_kpis(player_name)
kpi_df
Role retrieved for Sam Curran: Allrounder
Out[21]:
| KPI | Value | |
|---|---|---|
| 0 | Total Runs Scored | 12.00 |
| 1 | Batting Average | 12.00 |
| 2 | Average Balls Faced | 7.00 |
| 3 | Boundary Percentage | 7.14 |
| 4 | Boundaries Hit Per Match | 0.50 |
| 5 | Boundary Frequency (balls per boundary) | 14.00 |
| 6 | Total Wickets Taken | 13.00 |
| 7 | Bowling Average | 11.38 |
| 8 | Economy Rate | 6.73 |
| 9 | Bowling Strike Rate | 10.38 |
| 10 | Dot Balls Bowled Percentage | 48.89 |
Visualization¶
In [22]:
import plotly.express as px
import plotly.graph_objects as go
team_colors = {
'Afghanistan': 'green',
'Australia': 'yellow',
'Bangladesh': '#006400',
'England': '#00247D',
'India': '#0099FF',
'Ireland': '#009B77',
'Namibia': '#0033A0',
'Netherlands': 'darkOrange',
'New Zealand': '#000000',
'Pakistan': '#004B49',
'Scotland': '#0033A0',
'South Africa': '#008C8C',
'Sri Lanka': '#FFD700',
'U.A.E.': 'chocolate',
'West Indies': '#8A2C2D',
'Zimbabwe': 'tomato'
}
def h_bar_plot(df, x_column, y_column, title, x_title, y_title):
fig = px.bar(df, x=x_column, y=y_column, color=y_column, text=df[x_column],
title=title, labels={x_column: x_title, y_column: y_title},
color_discrete_map=team_colors, orientation='h')
fig.update_traces(marker_line_width=1.5, opacity=0.8,
textposition='outside')
fig.update_layout(title_font_size=24, showlegend=True)
fig.show()
def bar_plot(df, x_column, y_column, title, x_title, y_title):
fig = px.bar(df, x=x_column, y=y_column, color=x_column,text=df[y_column],
title=title, labels={x_column: x_title, y_column: y_title},
color_discrete_map=team_colors)
fig.update_traces(marker_line_width=1.5, opacity=0.8, textposition='outside')
fig.update_layout(title_font_size=24,showlegend=True)
fig.show()
def create_scatter_plot(df, x_column, y_column, title, x_title, y_title, size_column=None):
if size_column:
fig = px.scatter(
df,
x=x_column,
y=y_column,
color=x_column,
size=size_column,
title=title,
labels={x_column: x_title, y_column: y_title},
color_discrete_map=team_colors
)
else:
fig = go.Figure()
for team, color in team_colors.items():
team_data = df[df['team'] == team]
if not team_data.empty:
hover_text = team_data.apply(lambda row: f'Team: {team}<br>{x_title}: {row[x_column]:.2f}<br>{y_title}: {row[y_column]:.2f}', axis=1)
fig.add_trace(go.Scatter(
x=team_data[x_column],
y=team_data[y_column],
mode='markers',
marker=dict(
size=20,
color=color,
opacity=1,
line=dict(width=2, color=color)
),
hovertext=hover_text,
hoverinfo='text',
name=team,
showlegend=True
))
fig.update_layout(title=title, title_font_size=24, xaxis_title=x_title, yaxis_title=y_title, showlegend=True)
fig.show()
In [23]:
q="""SELECT b.batsmanName as BatsmanName,MAX(b.runs) AS "Top Individual Scores",
m.matchId as MatchId,DATE(m.matchDate)as MatchDate,
m.ground as Ground,
CASE
WHEN teamInnings = m.team1 THEN m.team2
WHEN teamInnings= m.team2 THEN m.team1
END AS VS
FROM battingdf b
JOIN matchdf m ON b.matchId = m.matchId
GROUP BY b.batsmanName, m.matchId, m.matchDate, m.ground, VS
ORDER BY "Top individual scores" DESC
LIMIT 10;
"""
q1=mysql(q)
q1
Out[23]:
| BatsmanName | Top Individual Scores | MatchId | MatchDate | Ground | VS | |
|---|---|---|---|---|---|---|
| 0 | Rilee Rossouw | 109 | 1847 | 2022-10-27 | Sydney | Bangladesh |
| 1 | Glenn Phillips | 104 | 1850 | 2022-10-29 | Sydney | Sri Lanka |
| 2 | Devon Conway | 92 | 1839 | 2022-10-22 | Sydney | Australia |
| 3 | Alex Hales | 86 | 1878 | 2022-11-10 | Adelaide | India |
| 4 | Michael Jones | 86 | 1833 | 2022-10-19 | Hobart | Ireland |
| 5 | Sikandar Raza | 82 | 1828 | 2022-10-17 | Hobart | Ireland |
| 6 | Virat Kohli | 82 | 1842 | 2022-10-23 | Melbourne | Pakistan |
| 7 | Jos Buttler | 80 | 1878 | 2022-11-10 | Adelaide | India |
| 8 | Kusal Mendis | 79 | 1835 | 2022-10-20 | Geelong | Netherlands |
| 9 | Pathum Nissanka | 74 | 1832 | 2022-10-18 | Geelong | U.A.E. |
In [24]:
q="""SELECT b.bowlerName AS "Bowler Name",
MAX(b.wickets) AS "Top Individual Wickets",
b.runs AS "Runs Given",
m.matchId AS "MatchId",
DATE(m.matchDate) AS "MatchDate",
m.ground AS "Ground",
CASE
WHEN b.bowlingTeam = m.team1 THEN m.team2
WHEN b.bowlingTeam = m.team2 THEN m.team1
ELSE NULL
END AS "VS"
FROM bowlingdf b
JOIN matchdf m ON b.matchId = m.matchId
GROUP BY b.bowlerName, m.matchId, DATE(m.matchDate), m.ground, "VS"
ORDER BY "Top Individual Wickets" DESC
LIMIT 10;
"""
q2=mysql(q)
q2
Out[24]:
| Bowler Name | Top Individual Wickets | Runs Given | MatchId | MatchDate | Ground | VS | |
|---|---|---|---|---|---|---|---|
| 0 | Sam Curran | 5 | 10 | 1840 | 2022-10-22 | Perth | Afghanistan |
| 1 | Alzarri Joseph | 4 | 16 | 1834 | 2022-10-19 | Hobart | Zimbabwe |
| 2 | Anrich Nortje | 4 | 10 | 1847 | 2022-10-27 | Sydney | Bangladesh |
| 3 | Anrich Nortje | 4 | 41 | 1861 | 2022-11-03 | Sydney | Pakistan |
| 4 | Lungi Ngidi | 4 | 29 | 1853 | 2022-10-30 | Perth | India |
| 5 | Mohammad Wasim | 4 | 24 | 1849 | 2022-10-27 | Perth | Zimbabwe |
| 6 | Shaheen Shah Afridi | 4 | 22 | 1872 | 2022-11-06 | Adelaide | Bangladesh |
| 7 | Taskin Ahmed | 4 | 25 | 1843 | 2022-10-24 | Hobart | Netherlands |
| 8 | Trent Boult | 4 | 13 | 1850 | 2022-10-29 | Sydney | Sri Lanka |
| 9 | Arshdeep Singh | 3 | 32 | 1842 | 2022-10-23 | Melbourne | Pakistan |
In [25]:
q = """
SELECT
CASE
WHEN margin LIKE '%runs%' THEN 'Batting First(Defending)'
WHEN margin LIKE '%wickets%' THEN 'Bowling First(Chasing)'
END AS winning_method,
COUNT(*) AS win_count
FROM matchdf
WHERE margin LIKE '%runs%' OR margin LIKE '%wickets%'
GROUP BY winning_method;
"""
q3 = mysql(q)
fig = px.pie(
q3,
names='winning_method',
values='win_count',
title='Winning Strategies: Batting First vs Bowling First',
color_discrete_sequence=['skyblue', 'lightgreen']
)
fig.update_layout(title_font_size=24)
fig.show()
In [26]:
q = """
SELECT winner AS team,
COUNT(*) AS win_count
FROM matchdf
WHERE winner IS NOT NULL AND winner != 'No Result'
GROUP BY team
ORDER BY win_count DESC;
"""
q4 = mysql(q)
h_bar_plot(q4, 'win_count', 'team', 'Number of Wins by Team', 'Number of Wins', 'Team')
In [27]:
q="""WITH BattingCategory AS (
SELECT
teamInnings,
CASE
WHEN battingPos IN (1, 2) THEN 'Openers'
WHEN battingPos IN (3, 4, 5) THEN 'Middle Order'
WHEN battingPos IN (6, 7) THEN 'Lower Middle Order'
ELSE 'Lower Order'
END AS "Batting Category",
runs,
balls,
"4s",
"6s"
FROM
battingdf
),
Metrics AS (
SELECT
"Batting Category",
ROUND(SUM(runs), 2) AS "Total Runs",
ROUND(AVG(runs), 2) AS "Average Runs",
ROUND(SUM(balls), 2) AS "Total Balls Faced",
ROUND((SUM("4s") + SUM("6s")) * 100.0 / SUM(balls), 2) AS "Boundary Percentage"
FROM BattingCategory
GROUP BY "Batting Category"
)
SELECT
"Batting Category",
"Total Runs",
"Average Runs",
ROUND("Total Runs" * 100.0 / NULLIF("Total Balls Faced", 0), 2) AS "AVG Strike Rate",
"Total Balls Faced",
"Boundary Percentage"
FROM Metrics
ORDER BY CASE "Batting Category"
WHEN 'Openers' THEN 1
WHEN 'Middle Order' THEN 2
WHEN 'Lower Middle Order' THEN 3
WHEN 'Lower Order' THEN 4
END;
"""
q5=mysql(q)
q5
create_scatter_plot(q5, 'Batting Category',
'AVG Strike Rate',
size_column='Average Runs',
title="Batting Performance by Category",
x_title="Batting Category",
y_title="Average Strike Rate")
In [28]:
q="""SELECT teamInnings as Team, SUM(runs) AS total_runs
FROM battingdf
GROUP BY Team
ORDER BY total_runs DESC;
"""
q6=mysql(q)
bar_plot(q6, 'Team', 'total_runs', 'Total Runs Scored by Each Team', 'Team', 'Total Runs')
In [29]:
q = """
SELECT teamInnings as Team, ROUND(AVG(total_runs),2) AS "Avg Runs Per Match"
FROM (
SELECT teamInnings, matchId, SUM(runs) AS total_runs
FROM battingdf
GROUP BY teamInnings, matchId
) AS match_runs
GROUP BY Team
ORDER BY "Avg Runs Per Match" DESC;
"""
q7 = mysql(q)
bar_plot(q7, 'Team', 'Avg Runs Per Match', 'AVG Runs Scored Per Match by Each Team', 'Team', 'AVG Runs')
In [30]:
q= """SELECT team,ROUND(SUM(wins) * 1.0 / SUM(total_matches) * 100 ,2) AS "Win Loss Ratio"
FROM(
SELECT team1 AS team,
SUM(CASE WHEN winner = team1 THEN 1 ELSE 0 END) AS wins,
COUNT(*) AS total_matches
FROM matchdf
GROUP BY team1
UNION ALL
SELECT team2 AS team,
SUM(CASE WHEN winner = team2 THEN 1 ELSE 0 END) AS wins,
COUNT(*) AS total_matches
FROM matchdf
GROUP BY team2
) AS combined
GROUP BY team
ORDER BY "Win Loss Ratio" DESC;
"""
q8=mysql(q)
h_bar_plot(q8, 'Win Loss Ratio', 'team', 'Wins Loss Ratio of Each Team(Percentage)', 'Win/Loss Ratio ', 'Team')
In [31]:
q= """
SELECT bowlingTeam AS team,
SUM(wickets) AS "Total Wickets"
FROM bowlingdf
GROUP BY bowlingTeam
ORDER BY "Total Wickets" DESC;
"""
q9=mysql(q)
h_bar_plot(q9, 'Total Wickets', 'team', 'Total Wickets Taken by Each Team', 'Total Wickets ', 'Team')
In [32]:
q= """SELECT bowlingTeam AS Team,
ROUND(SUM(runs) * 1.0 / SUM(overs),2) AS "Economy Rate"
FROM bowlingdf
GROUP BY Team
ORDER BY "Economy Rate" DESC;
"""
q10=mysql(q)
bar_plot(q10, 'Team', 'Economy Rate', 'AVG Economy Rate by Each Team', 'Team', 'Economy Rate')
In [33]:
q="""
SELECT teamInnings AS Team,
ROUND((SUM(runs) * 1.0 / SUM(balls)) * 100,2) AS "Strike Rate"
FROM battingdf
GROUP BY Team
ORDER BY "Strike Rate" DESC;
"""
q11=mysql(q)
bar_plot(q11, 'Team', 'Strike Rate', 'AVG Batting Strike Rate by Each Team', 'Team', 'Batting Strike Rate')
In [34]:
q= """
SELECT bowlingTeam AS Team,
ROUND((SUM(total_balls_bowled) * 1.0 / SUM(wickets)),2) AS "Bowling Strike Rate"
FROM bowlingdf
GROUP BY Team
ORDER BY "Bowling Strike Rate" DESC;
"""
q12=mysql(q)
bar_plot(q12, 'Team','Bowling Strike Rate', 'AVG Bowling Strike Rate by Each Team', 'Team', 'Bowling Strike Rate')
In [35]:
q="""WITH BattingDetails AS (
SELECT
b.teamInnings AS batting_team,
bs.matchId,
bs."0s",
bs.total_balls_bowled,
bs.wides,
bs.noBalls
FROM
battingdf b
JOIN
bowlingdf bs
ON
b.matchId = bs.matchId
AND b.teamInnings = CASE
WHEN bs.bowlingTeam = m.team1 THEN m.team2
WHEN bs.bowlingTeam = m.team2 THEN m.team1
END
JOIN
matchdf m
ON
bs.matchId = m.matchId
)
SELECT
b.batting_team as Team,
SUM(b."0s") AS total_dot_balls_faced,
SUM(b.total_balls_bowled - b.wides - b.noBalls) AS total_balls_faced,
ROUND((SUM(b."0s") * 100.0) / SUM(b.total_balls_bowled - b.wides - b.noBalls),2) AS "Dot Ball Percentage"
FROM BattingDetails b
GROUP BY Team
ORDER BY "Dot Ball Percentage" DESC;
"""
q13=mysql(q)
bar_plot(q13, 'Team','Dot Ball Percentage', 'Dot Ball Percentage by Each Team(Batting)', 'Team', 'Dot Ball Percentage ')
In [36]:
q="""
SELECT bowlingTeam AS Team,
SUM(extras) / COUNT(DISTINCT matchId) AS "Avg Extras Per Match"
FROM bowlingdf
GROUP BY bowlingTeam
ORDER BY "Avg Extras Per Match" DESC;
"""
q14=mysql(q)
bar_plot(q14, 'Team','Avg Extras Per Match', 'Avg Extras Conceded Per Match by Each Team', 'Team', 'Extras Conceded')
In [37]:
q="""SELECT
teamInnings AS team,
ROUND(AVG(runs),2) AS "Average Runs",
SUM(balls) AS total_balls_faced,
(SUM(runs) * 100.0 / NULLIF(SUM(balls), 0)) AS strike_rate
FROM battingdf
WHERE battingPos IN (1, 2)
GROUP BY team;
"""
q15=mysql(q)
create_scatter_plot(q15, 'Average Runs', 'strike_rate', 'Openers AVG Runs vs Strike Rate by Team(Batting Position 1&2)', 'Average Runs', 'Strike Rate')
In [38]:
q="""SELECT
teamInnings AS team,
AVG(runs) AS "Average Runs",
SUM(balls) AS total_balls_faced,
(SUM(runs) * 100.0 / NULLIF(SUM(balls), 0)) AS strike_rate
FROM battingdf
WHERE battingPos IN (3,4,5)
GROUP BY team;
"""
q16=mysql(q)
create_scatter_plot(q16, 'Average Runs', 'strike_rate', 'Middle Order AVG Runs vs Strike Rate by Team(Batting Position 3,4,5)', 'Average Runs', 'Strike Rate')
In [39]:
q="""SELECT
teamInnings AS team,
AVG(runs) AS "Average Runs",
SUM(balls) AS total_balls_faced,
(SUM(runs) * 100.0 / NULLIF(SUM(balls), 0)) AS strike_rate
FROM battingdf
WHERE battingPos IN (6,7)
GROUP BY team;
"""
q17=mysql(q)
create_scatter_plot(q17, 'Average Runs', 'strike_rate', 'Lower Middle Order AVG Runs vs Strike Rate by Team(Batting Position 6,7)', 'Average Runs', 'Strike Rate')
In [40]:
q="""SELECT batsmanName as PlayerName,
ROUND(CAST(SUM(runs) AS FLOAT) / NULLIF(COUNT(DISTINCT CASE WHEN `out/not_out` = 1 THEN matchId END), 0) ,2)AS "Average Runs",
SUM(balls) AS total_balls_faced,
ROUND((SUM(runs) * 100.0 / NULLIF(SUM(balls), 0)),2)AS strike_rate
FROM battingdf
GROUP BY PlayerName
HAVING SUM(balls) >= 75
ORDER BY "Average Runs" DESC
LIMIT 10;
"""
q18=mysql(q)
fig = go.Figure()
fig = px.scatter(q18,
x='Average Runs',
y='strike_rate',
size='Average Runs',
color='PlayerName',
title='Top 10 Batters: AVG Runs vs AVG SR(Minimum 75 balls Faced)',
labels={'Average Runs': 'Average Runs ', 'strike_rate': 'Strike Rate'})
fig.update_traces( marker=dict( opacity=1))
fig.update_layout(title_font_size=24,showlegend=True)
fig.update_layout(
xaxis_title='Average Runs',
yaxis_title='Strike Rate')
In [41]:
q="""SELECT
BowlerName AS PlayerName,
SUM(wickets) AS total_wickets,
SUM(runs) AS total_runs_conceded,
SUM(total_balls_bowled) AS total_balls_bowled,
ROUND(CAST(SUM(runs) AS FLOAT) / NULLIF(SUM(total_balls_bowled)/ 6, 0),2) AS economy_rate
FROM bowlingdf
GROUP BY PlayerName
HAVING SUM(total_balls_bowled) >= 50
ORDER BY total_wickets DESC
LIMIT 10;
"""
kpi=mysql(q)
fig = px.scatter(kpi,
x='total_wickets',
y='economy_rate',
size='total_wickets',
color='PlayerName',
title='Top 10 Bowlers: Total Wickets vs Economy Rate',
labels={'total_wickets': 'Total Wickets', 'economy_rate': 'Economy Rate'})
fig.update_traces( marker=dict( opacity=1))
fig.update_layout(title_font_size=24,showlegend=True)
fig.update_layout(
xaxis_title='Total Wickets',
yaxis_title='Economy Rate')
fig.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: